In house data

Objective

  • Clean up our in house data. The NextSeq run and NovaSeq run contains technical replicates. We would like to remove the tech rep.
  • Prepare data for PAGA analysis

Conclusion

Set-up

Load required library

library(scater)
library(Seurat)
library(patchwork)
library(ggplot2)
library(dplyr)
library(reshape2)
library(ggbeeswarm)
library(gghighlight)
library(ggrepel)
library(viridis)
library(ggforce)

library(paletteer)
# Scater
library(scater)
library(scran)
library(dendextend)
library(dynamicTreeCut)

Specify the work space

workDir <- getwd()
workDir <- gsub("/scripts", "", workDir)
dataDir <- file.path(workDir, "data")
outDir <- file.path(workDir, "output")

# out data
outData_dir <- file.path(outDir, "out_data")

outDir_cur <- file.path(outDir, "Chapter_03_InHouse_YS")
dir.create(outDir_cur)
## Warning in dir.create(outDir_cur): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS' already exists
set.seed(123)

source(file.path(workDir, "functions.R"))

Load the integrated object

load(file=file.path(outData_dir, "ForIntegration_InHouse_Public.Rdata"))

We have read the original single cell exp object

se_next <- readRDS(file.path(outData_dir, "Se_Next_allGenes_ENSID.RDS"))
se_nova <- readRDS(file.path(outData_dir, "Se_Nova_allGenes_ENSID.RDS"))

Load INDEX sort data

# We know all cell sorted in this plate is Runx1 / Gfi1 reporter
df_inx_WN_S6 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs6_AGM060.csv")) %>%
  dplyr::rename(FACS_well = OriWell) %>%
  mutate(Plate = gsub("Plate", "", Plate),
         SC_project = 'WN-S6',
         UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
dplyr::select(UniqID, Population2, Runx1b_RFP, Gfi1_GFP) %>%
  mutate(Allele = "RG")

# plate WN S7
df_inx_WN_S7 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs7_WNs8_AGM068.csv")) %>%
  dplyr::rename(FACS_well = OriWell) %>%
  mutate(tmp = Plate,
         SC_project = gsub("_p[1-5]", "", tmp),
         SC_project = gsub("_", "-", SC_project),
         Plate = gsub(".*_p", "", tmp),
         UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
  dplyr::select(UniqID, Population2, Runx1b_RFP, Gfi1_GFP,FACS_well)

# We know which well is only the Gfi1 reporter (based on plate layout excel file)
Gfi_mouse_well <- c(
  paste0(rep("G", each = ),sprintf("%02d",5:12)),
  paste0(rep(LETTERS[8:15], each = 23),sprintf("%02d",1:12)),
  paste0(rep("P", each=3),sprintf("%02d",10:13)))

df_inx_WN_S7 <- df_inx_WN_S7 %>%
  mutate(Allele = if_else(FACS_well %in% Gfi_mouse_well, "G", "RG")) %>%
  dplyr::select(-FACS_well)

df_indx <- rbind(df_inx_WN_S6, df_inx_WN_S7)

# Lets also read INDEX data for EMP and LMP
df_inx_WN_S10 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs10_AGM083.csv")) %>%
  dplyr::rename(FACS_well = OriWell) %>%
  mutate(Plate = "1",
         SC_project = 'WN-S10',
         UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
dplyr::select(UniqID, Population, CD127_eF450:LIN, CD127_PE:CD41_PECy7) %>%
  mutate(Allele = "WT")

Load list of CC genes

s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

Read list of cell surface genes

df_cs <- read.delim(file.path(dataDir, "Others/RM_cur_CS_mouse.txt"), stringsAsFactors = FALSE)

Using scater

outDir_inHouse6_scater <- file.path(outDir_cur, "Scater_analysis_YS_only")
dir.create(outDir_inHouse6_scater)
## Warning in dir.create(outDir_inHouse6_scater): '/Users/zfadlullah/Dropbox (The
## University of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/
## output/Chapter_03_InHouse_YS/Scater_analysis_YS_only' already exists

We use the Scater object where Gfi1/1b KO and TechRep were removed and re-run the analysis in scater Simply merge the two objects

We loaded the single cell object in the Set-up section. Simply merge the two objects

dat1 <- assay(se_next, "counts")
dat2 <- assay(se_nova, "counts")
dat <- cbind(dat1,dat2)

c.anno1 <- colData(se_next)
c.anno2 <- colData(se_nova)
c.anno <- rbind(c.anno1, c.anno2)

g.anno <- rowData(se_next)
colnames(g.anno)[5] <- "chr_name"

se <- 
  SingleCellExperiment(
  assays = list(counts = dat),
  colData = c.anno,
  rowData = g.anno)

se$condition <- factor(se$condition, levels=com_popFac)

Retain only the YS cells and remove Gfi1/1b KOs

AnoSite <- se$condition
AnoSite <- gsub("_.*", "", AnoSite)
table(AnoSite)
## AnoSite
##  AGM  MES   YS 
##  702   94 1504
se$Site <- AnoSite
se <- se[, se$Site == "YS"]

# Remove Gfi1/1b KO
se <- se[,!se$condition == "YS_E9.5_HE_Gfi1s_KO"]
# Remove the KIT negative sorted cells
se <- se[,!se$condition %in% c("YS_E9.0_HE_Gfi1_Kneg", "YS_E9.5_HE_Gfi1_Kneg", "YS_E10_HE_Gfi1_Kneg")]

Remove tech duplciate

Now remove technical replicate, we select the ones with the most genes to keep.

cD <- as.data.frame(colData(se))

  
cD <- cD %>%
  # Find the lane in which the sample was sequenced on
  mutate(Lane = gsub("*.L001", "L1", FullName),
         Lane = gsub(".*L002", "L2", Lane),
         Lane = gsub("GL.*", "L1", Lane),
         SeqBatch = paste(SeqRun, Lane, sep="_"),
  # Find the Uniq ID
         Plate_Well = paste(SC_project, SC_plate, FACS_well, sep="_"))

# Identify tech replicates
df_com_twice <- as_tibble(cD) %>%
  dplyr::count(Plate_Well) %>% dplyr::filter(n > 1)

qc_cell <- perCellQCMetrics(se)
cD <- cbind(cD, qc_cell)

cD_noTech <- cD %>%
  arrange(desc(detected)) %>%
  distinct(Plate_Well,.keep_all = TRUE)

# Subset
se <- se[,se$Barcode %in% cD_noTech$Barcode]

Filtering lowly-expressed genes

Remove all ENS with 0 counts

row_sub <- rowSums(dat)
row_sub <- names(row_sub)[row_sub > 0]

Convert to gene names

se <- se[row_sub, ]

new.row.names <- uniquifyFeatureNames(
    rowData(se)$ID,
    rowData(se)$Symbol
)

rownames(se) <- new.row.names
head(rownames(se), 5)
## [1] "Xkr4"    "Gm1992"  "Gm19938" "Gm37381" "Rp1"

Normalisation

set.seed(123)
clust.se <- quickCluster(se) 
se <- computeSumFactors(se, cluster=clust.se, min.mean=0.1)
se <- logNormCounts(se)
#assay(se,"logcounts")["Runx1",]

Feature selection

set.seed(123)
dec.se <- modelGeneVar(se)

# Visualizing the fit:
fit.se <- metadata(dec.se)
plot(fit.se$mean, fit.se$var, xlab="Mean of log-expression",
    ylab="Variance of log-expression")
curve(fit.se$trend(x), col="dodgerblue", add=TRUE, lwd=2)

Cell cycle assginment

mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", 
    package="scran"))


# Using Ensembl IDs to match up with the annotation in 'mm.pairs'.
assignments <- cyclone(se, mm.pairs, gene.names=rowData(se)$ID)
se$Phase <- assignments$phases

saveRDS(se, file.path(outDir_cur, "seNorm_YS.RDS"))

Dim reduction

Select the most variable genes

set.seed(123)
top.prop <- getTopHVGs(dec.se, prop=0.1)
top.se <- getTopHVGs(dec.se, n=2000)
length(top.prop)
## [1] 967
length(top.se)
## [1] 2000
chosen <- top.se

Run dim reduction

se <- runPCA(se, subset_row=chosen) 

percent.var <- attr(reducedDim(se), "percentVar")
plot(percent.var, log="y", xlab="PC", ylab="Variance explained (%)")

UMAP

set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
              pca=30,
              n_neighbors=30,
              min_dist=0.4)
plotUMAP(se, colour_by="condition")

Clustering

Graph based clustering

# Graph based
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased_all <- paste0("g_", sprintf("%02d",clust))

Visualise

Visualise UMAP better

umap.df.int <- reducedDim(se, "UMAP") %>%
  cbind(colData(se)) %>%
  as.data.frame() %>%
  dplyr::rename(UMAP_1 = V1,
                UMAP_2 = V2)

Plot

p1 <- 
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~SeqRun) +
  theme_custom +
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the Sequencing batch")

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~condition) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the FACS population")

p3 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased_all)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~GraphBased_all) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")

p4 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~Phase) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cell cycle")


png(file.path(outDir_inHouse6_scater, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater, "facet_cluster.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen 
##                 2

We can check the expression of the following genes

plotUMAP(se, colour_by=c("Folr1"))

plotUMAP(se, colour_by=c("Dlk1"))

plotUMAP(se, colour_by=c("Ptn"))

plotUMAP(se, colour_by=c("Hbb-y"))

Save the single cell object

seOri <- se
saveRDS(seOri, file.path(outDir_inHouse6_scater, "SeNotFiltered.RDS"))
#seOri <- readRDS(file.path(outDir_inHouse6_scater, "SeNotFiltered.RDS"))

Remove outlier

outDir_inHouse6_scater_outlier <- file.path(outDir_inHouse6_scater, "Outlier_removed")
dir.create(outDir_inHouse6_scater_outlier)
## Warning in dir.create(outDir_inHouse6_scater_outlier): '/Users/zfadlullah/
## Dropbox (The University of Manchester)/zaki/PhD/sequencing_runs/
## 2021/2021_06_01_MZ_016/output/Chapter_03_InHouse_YS/Scater_analysis_YS_only/
## Outlier_removed' already exists

Things we remove ;

  1. There is a small cluster expressing Folr1
  2. Remove the Hbb expressing cluster
  3. Remove the mesenchymal cluster (expressing Ptn)
  4. Remove cells with high Ribo and low genes

Folr1 - https://pubmed.ncbi.nlm.nih.gov/19180647/

By selecting regions on the UMAP we can remove the cells above

# Remove the Hbb clusters
toRemove_1 <- umap.df.int %>%
  dplyr::filter(GraphBased_all == "g_12") %>%
  pull(Barcode)

Recluster

se <- se[,!se$GraphBased_all == "g_12"]
g <- buildSNNGraph(se, k=5, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
length(unique(clust))
## [1] 13
se$GraphBased_all <- paste0("g_", sprintf("%02d",clust))

Re-plot

umap.df.int2 <- reducedDim(se, "UMAP") %>%
  cbind(colData(se)) %>%
  as.data.frame() %>%
  dplyr::rename(UMAP_1 = V1,
                UMAP_2 = V2)
p5 <- 
ggplot(
  umap.df.int2,
  aes(x=UMAP_1, y=UMAP_2, colour=GraphBased_all)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~GraphBased_all) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")


png(file.path(outDir_inHouse6_scater, "facet_cluster_noHbb.png"), height=950, width = 1200)
p5
dev.off()
## quartz_off_screen 
##                 2

Remove additional cells

plotUMAP(se, colour_by=c("Folr1"))

plotUMAP(se, colour_by=c("Dlk1"))

plotUMAP(se, colour_by=c("Ptn"))

# Remove MES (ptn expressing cells)
toRemove_2 <- umap.df.int2 %>%
  dplyr::filter(GraphBased_all == "g_13") %>%
  pull(Barcode)

# Remove Folr1 expressing cells
toRemove_3 <- umap.df.int2 %>%
  dplyr::filter(GraphBased_all == "g_11") %>%
  pull(Barcode)

Find cells in the cluster with high Ribo and low numebr of genes detected

#se <- seOri
rD <- as.data.frame(rowData(se))

# Specific the location
MT_gene <- rD$chr_name == "chrM"
Hbbs <- rD[grepl("Hb[a-b]", rD$Symbol),] %>% pull(Symbol)
Hbbs <- c(Hbbs, "Gypa")
Ribos_1 <- rD[grepl("Rps", rD$Symbol),] %>% pull(Symbol)
Ribos_2 <- rD[grepl("Rpl", rD$Symbol),] %>% pull(Symbol)
Ribos <- c(Ribos_1, Ribos_2)

# Get the Ens ID
Hb_ens <- rD %>%
  dplyr::filter(Symbol %in% Hbbs)
Hbb_gene <- rowData(se)$ID %in% Hb_ens$ID

Ribo_ens <- rD %>%
  dplyr::filter(Symbol %in% Ribos)
Ribo_gene <- rowData(se)$ID %in% Ribo_ens$ID

#We examine the distibution of the data in all the run


df_cData <- 
  perCellQCMetrics(se,
                   subsets=list(Mito=MT_gene,
                                Hbb=Hbb_gene,
                                Ribo=Ribo_gene)) %>%
  cbind(as.data.frame(colData(se))) %>%
  as.data.frame() %>%
  dplyr::mutate(Cell = colnames(se))


se$detected <- df_cData$detected
se$RiboPerc <- df_cData$subsets_Ribo_percent
se$MitoPerc <- df_cData$subsets_Mito_percent

plotColData(se, x="GraphBased_all", y="MitoPerc")

plotColData(se, x="GraphBased_all", y="RiboPerc")

plotColData(se, x="GraphBased_all", y="detected")

# Remove the cluster with highest Ribo which also has lowest number of detected genes
toRemove_4 <- umap.df.int2 %>%
  filter(GraphBased_all == "g_12") %>%
  pull(Barcode)

Theres another cluster (g_10) that has low genes and high Riba as well.

# ----- Scran -------- #
# Quick check what is expressed in g_10
colLabels(se) <- se$GraphBased_all

# We can change the pval.type="all" to be more stringent
markers.se <- findMarkers(se, pval.type="some", direction="up")

chosen_clus <- "g_10"
interesting <- markers.se[[chosen_clus]]
#best.set <- interesting[interesting$Top <= 10,]

plotUMAP(se, colour_by=c("Nutf2-ps1"))


# ------ Seurat -------- #
seuset <- Seurat::as.Seurat(se)
seuset <- NormalizeData(seuset)

Idents(seuset) <- seuset$GraphBased_all

su.markers <- 
  Seurat::FindMarkers(seuset, only.pos = FALSE, 
                       ident.1 = c("g_10"),
                       #ident.2 = c("C05"),
                       min.pct = 0.25, logfc.threshold = 0.25)

top_x <- su.markers %>% 
  tibble::rownames_to_column(var = "gene") %>%
  dplyr::filter(p_val_adj < 0.05) %>%
  top_n(n = 10, wt = avg_log2FC)

FeaturePlot(seuset, top_x$gene)
FeaturePlot(seuset, row.names(interesting)[1:10])

We remove this cluster as well

toRemove_5 <- umap.df.int2 %>%
  filter(GraphBased_all == "g_10") %>%
  pull(Barcode)

Remove the selected cells

toRemove <- unique(c(toRemove_1, toRemove_2, toRemove_3, toRemove_4, toRemove_5))

Plot the selected cells

umap.df.int3 <- umap.df.int %>%
  dplyr::mutate(isSel = if_else(Barcode %in% toRemove, "Discard", "Retain"))

p1 <-        
  ggplot(umap.df.int3, aes(x=UMAP_1, y=UMAP_2, colour=isSel)) +
  geom_point() +
  #gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  #facet_wrap(~cluster) +
  theme_custom 


png(file.path(outDir_inHouse6_scater, "SelectedCells.png"), height=300, width = 450)
p1
dev.off()
## quartz_off_screen 
##                 2

Subset to the selected cells, and re-do the HVG selection, dim reduction etc

#seAll <- se
#se <- seAll

SelctedCells <- umap.df.int3 %>%
  dplyr::filter(isSel == "Retain")
se <- se[,se$Barcode %in% SelctedCells$Barcode]

set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=2500)
chosen <- top.se

se <- runPCA(se, subset_row=chosen) 

set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
              pca=30,
              n_neighbors=30,
              min_dist=0.3)
plotUMAP(se, colour_by="condition")

UMAP when CC is removed

diff <- getVarianceExplained(se, "Phase")
discard <- diff > 5

top.hvgs2 <- getTopHVGs(se[which(!discard),], n=2500)
se.nocycle <- runPCA(se, subset_row=top.hvgs2)

set.seed(9999)
se.nocycle <- runUMAP(se.nocycle, dimred="PCA", subset_row=top.hvgs2,
              pca=30,
              n_neighbors=30,
              min_dist=0.4)
plotUMAP(se.nocycle, colour_by="condition")

chosen2 <- top.hvgs2

Clustering

Re-do clustering

# Hirarchical clustering
my.dist  <- dist(reducedDim(se, "PCA")[,1:0])
my.tree <- hclust(my.dist, "ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))

# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)

my.tree_or <- my.clusters[order.dendrogram(my.dend)]

# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(dend)]]

no0_unique <- 
function(x) {
    u_x <- unique(x)
    u_x[u_x != 0]
}

clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
  set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

UniqClus <- unique(my.tree_or)

pdf(file.path(outDir_inHouse6_scater_outlier, "Dend_PCA.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)

dev.off()

se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))
# Hierarch by gene exp
chosen.exprs <- assay(se, "logcounts")[chosen,]
my.dist = dist(as.matrix(t(chosen.exprs)), method = "euclidean")
my.tree <- hclust(my.dist, method="ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))
## [1] 10
# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)

my.tree_or <- my.clusters[order.dendrogram(my.dend)]

# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(my.dend)]]

no0_unique <- 
function(x) {
    u_x <- unique(x)
    u_x[u_x != 0]
}

clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
  set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

UniqClus <- unique(my.tree_or)

pdf(file.path(outDir_inHouse6_scater_outlier, "Dend.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)

dev.off()
## quartz_off_screen 
##                 2
se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))

Graph based

# Re-run clusters
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased <- paste0("g_", sprintf("%02d",clust))

Visualise

Visualise UMAP better

umap.df.int <- reducedDim(se, "UMAP") %>%
  cbind(colData(se)) %>%
  as.data.frame() %>%
  dplyr::rename(UMAP_1 = V1,
                UMAP_2 = V2)

Plot

p1 <- 
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~SeqRun) +
  theme_custom +
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the Sequencing batch")

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~condition) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the FACS population")

p3 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~TreeCut) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")


p3.5 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~GraphBased) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")

p4 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~Phase) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cell cycle")


png(file.path(outDir_inHouse6_scater_outlier, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_cluster_tree.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_cluster_graph_default.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen 
##                 2

Make a nice UMAP plot

umap.df.int <- umap.df.int %>%
  dplyr::mutate(Estage = gsub("_Endo", "", condition),
                Estage = gsub("_HE.*", "", Estage),
                Estage = gsub("_EMP.*", "", Estage),
                Estage = gsub("_LMP.*", "", Estage),
                Estage = gsub("YS_", "", Estage),
                Estage_fac = case_when(Estage == "E9.0" ~ "g1",
                                       Estage == "E9.5" ~ "g2",
                                       Estage == "E10" ~ "g3",
                                       Estage == "E10.5" ~ "g3"),
                Cond_fac = gsub(".*_Endo", "Endo", condition),
                Cond_fac = gsub(".*Kneg", "Kneg", Cond_fac),
                Cond_fac = gsub(".*_HE.*", "HE", Cond_fac),
                Cond_fac = gsub(".*_EMP", "EMP", Cond_fac),
                Cond_fac = gsub(".*_LMP", "LMP", Cond_fac),
                Cond_fac = factor(Cond_fac, levels = c("Endo", "Kneg", "HE", "EMP", "LMP"))
                )

pal.YS <- c(
  
  rep("#4E79A7", 3), # Endo
  #rep("#F28E2B", 3), # Kit Neg
  rep("#D82526", 6), # Runx1 & GFi1
  #rep("#69B764", 3), # Gfi1
  rep("#7a0177", 2), # EMP
  rep("#f768a1", 2) # LMP
)
  
  
p1 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
  geom_point(size=1) +
  gghighlight(use_direct_label = FALSE) +
  theme_custom +
  facet_wrap( Cond_fac ~ Estage_fac, ncol=3) +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none") +
  scale_colour_manual(values=pal.YS) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the FACS")

p1

pdf(file.path(outDir_inHouse6_scater_outlier, "facet_eStage_pop.pdf"), height=10, width = 14)
p1
dev.off()
## quartz_off_screen 
##                 2
ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "facet_eStage_pop2.pdf"), device = "pdf", width = 8, height = 12, dpi = 300)

Remove contaminating “LMP”

outDir_noLMP <- file.path(outDir_inHouse6_scater, "no_Platelet")
dir.create(outDir_noLMP)
## Warning in dir.create(outDir_noLMP): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet' already exists

We noticed an the LMP population split into two. Based on expression of key genes this is the Platelet popualtion, in other words contamination.

geneDir <- file.path(outDir_noLMP, "gene_exp")
dir.create(geneDir)
## Warning in dir.create(geneDir): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet/gene_exp' already
## exists
gg <- "Gp9"
expInt <- assay(se, "logcounts")[gg,]
#exp2 <- sacle(expInt)
#exp2 <- as.vector(exp2)
df_plot <- umap.df.int %>%
  mutate(gene = expInt)

ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "none") +
  scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL

ggsave(filename = file.path(geneDir, paste0(gg, "_UMAP.png")), device = "png", width = 6, height = 4, dpi = 300)

Lets remove the contaminating cluster. We can remove it by the in silico cluster or we use the INDEX FACS data. Lets use the INDEX data and merge it

cD <- as.data.frame(colData(se))

# Join with index FACS info
dd <- cD %>%
  mutate(UniqID = paste(SC_project, SC_plate, FACS_well, sep="_")) %>%
  left_join(df_inx_WN_S10)
## Joining, by = "UniqID"
umap.df.int <- umap.df.int %>%
  dplyr::mutate(Barcode = se$Barcode)

dd_sub <- dd %>%
  dplyr::filter(condition %in% c("YS_E9.5_EMP", "YS_E10.5_EMP",
                                 "YS_E9.5_LMP", "YS_E10.5_LMP")) %>%
  #dplyr::filter(Allele == "RG") %>%
  mutate(lCD127_eF450 = log10(CD127_eF450 + 200) ,
         lCD127_PE = log10(CD127_PE + 200),
         
         lCD41_PE = log10(CD41_PE+ 200) ,
         lCD41_PECy7 = log10(CD41_PECy7 + 200)) %>%
  mutate(lCD41_PECy7_cutOff = if_else(lCD41_PECy7 >= 4.5, "high", "low")) %>%
  dplyr::left_join(dplyr::select(umap.df.int, UMAP_1, UMAP_2, Barcode)) 
## Joining, by = "Barcode"

Plot CD41 in UMAP

ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD127_PE)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "right") +
  scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL

ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD41_PECy7)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "right") +
  scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL

ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD41_PECy7_cutOff)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "right") +
  #scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL

Plot FACS

ggplot(dd_sub, aes(x=lCD41_PECy7, y=lCD127_PE, colour=condition)) +
  geom_point() +
  theme_custom
## Warning: Removed 64 rows containing missing values (geom_point).

ggplot(dd_sub, aes(x=lCD41_PECy7, y=lCD127_PE, colour=lCD41_PECy7_cutOff)) +
  geom_point() +
  theme_custom
## Warning: Removed 64 rows containing missing values (geom_point).

Now lets remove cells which have high CD41 based on INDEX sort

toRemove_nonLMP <- dd_sub %>%
  dplyr::filter(lCD41_PECy7_cutOff == "high") %>%
  pull(Barcode)

Re-run HVG & cluster

#se2 <- se
se <- se[,!se$Barcode %in% toRemove_nonLMP]

set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=3500)
chosen <- top.se

se <- runPCA(se, subset_row=chosen) 

set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
              pca=30,
              n_neighbors=35,
              min_dist=0.25,
              
              #n_neighbors=30,
              #min_dist=0.3,
              
              ncomponents = 2)
plotUMAP(se, colour_by="condition")

Try plot 3D

umap.df <- as.data.frame(reducedDim(se, "UMAP")) %>%
    cbind(colData(se)) %>%
  as.data.frame() 

# 3D plot
x1 <- umap.df$V1
x2 <- umap.df$V2
x3 <- umap.df$V3

df_pal <- df.col_FACS %>%
  dplyr::filter(FACS %in% umap.df$condition)
pal <- df_pal$Pal

kk <- umap.df$condition
kk.1 <- unique(umap.df$condition)
kk <- factor(as.character(kk), 
                levels=c(
                  "AGM_E10.5_Endo", "AGM_E10.5_HE_Runx1", "AGM_E10.5_HE_Gfi1", "AGM_E10.5_EHT", "AGM_E10.5_IAHC",
                
                "YS_E9.0_Endo", "YS_E9.5_Endo", "YS_E10.5_Endo", 
                "YS_E9.0_HE_Gfi1_Kneg", "YS_E9.5_HE_Gfi1_Kneg", "YS_E10_HE_Gfi1_Kneg",
                
                "YS_E9.0_HE_Runx1", "YS_E9.5_HE_Runx1", "YS_E10.5_HE_Runx1", 
                "YS_E9.0_HE_Gfi1", "YS_E9.5_HE_Gfi1", "YS_E10.5_HE_Gfi1",
                
                "YS_E9.5_EMP","YS_E10.5_EMP",
                "YS_E9.5_LMP", "YS_E10.5_LMP"))
kk[is.na(pal[kk])] %>% unique()

pal <- pal[kk]

plot3d(x1, x2, x3, col = pal)


# Or use plotly
plot_ly(umap.df, x= ~V1, y= ~V2, z= ~V3, color =  ~condition)
umap.df <- reducedDim(se, "UMAP") %>%
  cbind(colData(se)) %>%
  as.data.frame() %>%
  dplyr::rename(UMAP_1 = V1,
                UMAP_2 = V2,
                UMAP_3 = V3) %>%
  # Add a column to simplify the annotation
  mutate(
    MainPop = gsub("AGM_", "", condition),
    MainPop = gsub("YS_", "", condition),
    
    MainPop = gsub("HE_Gfi1", "HE", MainPop),
    MainPop = gsub("HE_Runx1", "HE", MainPop)
  )


x1 <- umap.df$UMAP_1
x2 <- umap.df$UMAP_2
x3 <- umap.df$UMAP_3

library(rgl)
pal <- df.col_FACS %>%
  dplyr::filter(FACS %in% umap.df$condition) %>%
  pull(Pal)
kk <- umap.df.int$condition
kk <- as.factor(as.character(kk))
pal <- pal[kk]

plot3d(x1, x2, x3, col = pal)

Cluster with h clustering

set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=3000)
chosen <- top.se


# Hierarch by gene exp
chosen.exprs <- assay(se, "logcounts")[chosen,]
my.dist = dist(as.matrix(t(chosen.exprs)), method = "euclidean")
my.tree <- hclust(my.dist, method="ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree, 
                                    distM=as.matrix(my.dist), 
                                    verbose=0, minClusterSize=10, 
                                    deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))
## [1] 10
# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)

my.tree_or <- my.clusters[order.dendrogram(my.dend)]

# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(my.dend)]]

no0_unique <- 
function(x) {
    u_x <- unique(x)
    u_x[u_x != 0]
}

clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
  set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

UniqClus <- unique(my.tree_or)

pdf(file.path(outDir_noLMP, "Dend.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)

plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)

dev.off()
## quartz_off_screen 
##                 2
se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))

Graph based

# Re-run clusters
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased <- paste0("g_", sprintf("%02d",clust))

Visualise

Visualise UMAP better

umap.df.int <- reducedDim(se, "UMAP") %>%
  cbind(colData(se)) %>%
  as.data.frame() %>%
  dplyr::rename(UMAP_1 = V1,
                UMAP_2 = V2) %>%
  dplyr::mutate(UMAP_1 = -1 * UMAP_1)

Plot

p1 <- 
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~SeqRun) +
  theme_custom +
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the Sequencing batch")

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~condition) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the FACS population")

p3 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~TreeCut) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")


p3.5 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~GraphBased) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")

p4 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~Phase) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cell cycle")


png(file.path(outDir_noLMP, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_noLMP, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_noLMP, "facet_cluster_tree.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_noLMP, "facet_cluster_graph_default.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen 
##                 2
png(file.path(outDir_noLMP, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen 
##                 2

Fine cluster adjustment

Lets simplify the hierachical clustering

#my.clusters <- cutree(my.tree, k=3)
#my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))

umap.df.int <- umap.df.int %>%
  dplyr::mutate(TreeCut_re = TreeCut,
                TreeCut_re = 
                  case_when(
                    TreeCut_re == "k_01" ~ 'k_2',
                    TreeCut_re == "k_02" ~ 'k_3',
                    TreeCut_re == "k_03" ~ 'k_2',
                    TreeCut_re == "k_04" ~ 'k_4',
                    TreeCut_re == "k_05" ~ 'k_5',
                    TreeCut_re == "k_06" ~ 'k_2',
                    TreeCut_re == "k_07" ~ 'k_1',
                    TreeCut_re == "k_08" ~ 'k_1',
                    TreeCut_re == "k_09" ~ 'k_2',
                    TreeCut_re == "k_10" ~ 'k_2'
                    ))

se$TreeCut_re <- umap.df.int$TreeCut_re

Plot the UMAP

# Setting up colours for the clusters
pal.YS_clust_simple <- 
  c(
  "#A0CBE8", # Blue
  "#FFBE7D", # Orange-sh
  #"#BD8A4C", # Brown / Green
  "#D4A6C8", "#631879", "#A20056" # Pink shades
  )


p3.5 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut_re)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~TreeCut_re) +
  theme_custom +
  scale_colour_manual(values=pal.YS_clust_simple) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the in-silico clusters")

png(file.path(outDir_noLMP, "facet_cluster_tree_refactor.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen 
##                 2

Re-draw the dendogram. Re draw as it is

clust_num <- as.integer(gsub("k_", "", umap.df.int$TreeCut_re))

clusters_numbers <- no0_unique(clust_num)
n_clusters <- length(clusters_numbers)
my.tree_or2 <- 
  case_when(
    my.tree_or == "1" ~ '2',
    my.tree_or == "2" ~ '3',
    my.tree_or == "3" ~ '2',
    my.tree_or == "4" ~ '4',
    my.tree_or == "5" ~ '5',
    my.tree_or == "6" ~ '2',
    my.tree_or == "7" ~ '1',
    my.tree_or == "8" ~ '1',
    my.tree_or == "9" ~ '2',
    my.tree_or == "10" ~ '2') %>%
  as.integer()



df_order <- data.frame(
  TreeLab = my.dend %>% labels(),
  Clust = my.tree_or2) %>%
  dplyr::arrange(Clust)

cols <- pal.YS_clust_simple[c(5, 4, 3, 1, 2)]
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or2, values = cols) %>%
  set("labels_col", "black") %>%
  rotate(as.character(df_order$TreeLab)) 
plot(dend2)

df_order2 <- data.frame(
  TreeLab = dend2 %>% labels()
) %>%
  left_join(df_order)
## Joining, by = "TreeLab"
UniqClus <- unique(df_order2$Clust)


pdf(file.path(outDir_noLMP, "Dend_reOrder.pdf"),height = 4, width = 12)
plot(dend2)
plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)

dev.off()
## quartz_off_screen 
##                 2

Nice Vis

We make a nicer UMAP for publication

umap.df.int <- umap.df.int %>%
  dplyr::mutate(Estage = gsub("_Endo", "", condition),
                Estage = gsub("_HE.*", "", Estage),
                Estage = gsub("_EMP.*", "", Estage),
                Estage = gsub("_LMP.*", "", Estage),
                Estage = gsub("YS_", "", Estage),
                Estage_fac = case_when(Estage == "E9.0" ~ "g1",
                                       Estage == "E9.5" ~ "g2",
                                       Estage == "E10" ~ "g3",
                                       Estage == "E10.5" ~ "g3"),
                Cond_fac = gsub(".*_Endo", "Endo", condition),
                Cond_fac = gsub(".*Kneg", "Kneg", Cond_fac),
                Cond_fac = gsub(".*_HE.*", "HE", Cond_fac),
                Cond_fac = gsub(".*_EMP", "EMP", Cond_fac),
                Cond_fac = gsub(".*_LMP", "LMP", Cond_fac),
                Cond_fac = factor(Cond_fac, levels = c("Endo", "Kneg", "HE", "EMP", "LMP"))
                )

pal.YS <- c(
  
  rep("#4E79A7", 3), # Endo
  #rep("#F28E2B", 3), # Kit Neg
  rep("#D82526", 6), # Runx1 & GFi1
  #rep("#69B764", 3), # Gfi1
  rep("#7a0177", 2), # EMP
  rep("#f768a1", 2) # LMP
)
  
  
p1 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
  geom_point(size=1) +
  gghighlight(use_direct_label = FALSE) +
  theme_custom +
  facet_wrap( Cond_fac ~ Estage_fac, ncol=3) +
   theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none") +
  scale_colour_manual(values=pal.YS) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the FACS")

p1

pdf(file.path(outDir_noLMP, "facet_eStage_pop.pdf"), height=10, width = 14)
p1
dev.off()
## quartz_off_screen 
##                 2
ggsave(filename = file.path(outDir_noLMP, "facet_eStage_pop2.pdf"), device = "pdf", width = 8, height = 12, dpi = 300)

Show and group clusters

umap.df.int <- umap.df.int %>%
  dplyr::mutate(ClusGroup = 
                  case_when(TreeCut == "k_02" ~ "g1",
                            TreeCut == "k_04" ~ "g1",
                            TreeCut == "k_05" ~ "g1",
                            TreeCut == "k_08" ~ "g2",
                            TreeCut == "k_03" ~ "g2",
                            TreeCut == "k_06" ~ "g2",
                            TreeCut == "k_07" ~ "g2",
                            TreeCut == "k_09" ~ "g3",
                            TreeCut == "k_01" ~ "g3",
                            ))

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~ ClusGroup, ncol=3) +
  theme_custom +
  #scale_colour_manual(values=g_hue) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cluster")


pdf(file.path(outDir_noLMP, "facet_TreeCluster.pdf"), height=3, width = 6)
p2
dev.off()

Show the Graph based clustering

pal.YS_clus <- c(
  "#225ea8", "#A0CBE8", # Blue
  "#F28E2B", "#FFBE7D", "#F1CE63", # Orange-sh
  "#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
  "#D4A6C8", "#631879", "#A20056" # Pink
  )

pal.YS_clus <- c(
  "#225ea8", "#A0CBE8", # Blue
  "#F28E2B", "#FFBE7D", # Orange-sh
  "#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
  "#D4A6C8", "#631879", "#A20056" # Pink
  )

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  #facet_wrap(~ ClusGroup, ncol=3) +
  theme_custom +
  scale_colour_manual(values=pal.YS_clus) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cluster")


# Add text to the plot
clust_data <- umap.df.int %>%
  group_by(GraphBased) %>%
  dplyr::summarise(UMAP_1 = mean(UMAP_1),
                   UMAP_2 = mean(UMAP_2))

p2.1 <- p2 +
  geom_point(data = clust_data, aes(fill = GraphBased),
                 size = 10, shape = 21, colour = "white") +
      scale_fill_manual(values = pal.YS_clus) +
      geom_text(data = clust_data, aes(label = GraphBased),
                colour = "black") +
  theme(legend.position = "none") 

p2.1

ggsave(filename = file.path(outDir_noLMP, "UMAP_GraphCluster.pdf"), device = "pdf", width = 7, height = 4.5, dpi = 300)

Show the Tree cut

pal.YS_clus <- c(
  "#225ea8", "#A0CBE8", # Blue
  "#F28E2B", "#FFBE7D", "#F1CE63", # Orange-sh
  "#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
  "#D4A6C8", "#631879", "#A20056" # Pink
  )

pal.YS_clus <- c(
  "#225ea8", "#A0CBE8", # Blue
  "#F28E2B", "#FFBE7D", # Orange-sh
  "#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
  "#D4A6C8", "#631879", "#A20056" # Pink
  )


p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut_re)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  #facet_wrap(~ ClusGroup, ncol=3) +
  theme_custom +
  scale_colour_manual(values=pal.YS_clust_simple) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cluster")


# Add text to the plot
clust_data <- umap.df.int %>%
  group_by(TreeCut_re) %>%
  dplyr::summarise(UMAP_1 = mean(UMAP_1),
                   UMAP_2 = mean(UMAP_2))

p2.1 <- p2 +
  geom_point(data = clust_data, aes(fill = TreeCut_re),
                 size = 10, shape = 21, colour = "white") +
      scale_fill_manual(values = pal.YS_clust_simple) +
      geom_text(data = clust_data, aes(label = TreeCut_re),
                colour = "black") +
  theme(legend.position = "none") 

p2.1

# 35w 25h
ggsave(filename = file.path(outDir_noLMP, "UMAP_TreeCut.pdf"), device = "pdf", width = 6, height = 4, dpi = 300)

We can show in a hull plot

p_hull_1 <-
ggplot(umap.df.int, aes(x = UMAP_1, y = UMAP_2)) +
  geom_mark_hull(
    aes(fill=TreeCut_re, label=TreeCut_re), 
    alpha=0.5,
    concavity=10, expand = unit(2, "mm"), radius = unit(1, "mm")) +
  geom_point(size = 0.5, colour = "grey50") +
  scale_fill_manual(values = pal.YS_clust_simple) +
  theme_void() +
  theme(legend.position = "none") +
  NULL
p_hull_1

ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "GraphCluster_hull1.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
p_hull_2 <-
  ggplot(umap.df.int, aes(x = UMAP_1, y = UMAP_2)) +
  geom_mark_hull(
    aes(fill=GraphBased, label=GraphBased), 
    alpha=0.0,
    concavity=20, expand = unit(1, "mm"), radius = unit(1, "mm")) +
  geom_point(size = 2, aes(colour = GraphBased)) +
  scale_colour_manual(values = pal.YS_clus) +
  theme_void() +
  theme(legend.position = "none") +
  NULL
p_hull_2
ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "GraphCluster_hull2.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)

Proportion

Show proportion as density plot (FACS like)

p1 <-
  ggplot(
  umap.df.int,
  #dplyr::filter(umap.df.int, TreeCut_re == "k_2"),
  aes(UMAP_1, UMAP_2)) + 
  #geom_density_2d(aes(colour = Estage)) +
  geom_density_2d_filled(contour_var = "ndensity") + 
  facet_wrap(TreeCut_re ~ Estage, ncol=3) +
  theme_custom +
  NULL

umap.df.int_tmp <- umap.df.int %>%
  dplyr::mutate(TreeCut_re = as.character(TreeCut_re),
                Estage = as.character(Estage)) %>%
  dplyr::mutate(Estage = 
                  case_when(
                    TreeCut_re %in% c("k_1", "k_2","k_3", "k_4", "k_5")  ~ "E1",
                    TRUE ~ Estage))
p2 <-
  ggplot(
  umap.df.int_tmp,
  #dplyr::filter(umap.df.int, TreeCut_re == "k_2"),
  aes(UMAP_1, UMAP_2)) + 
  #geom_density_2d(aes(colour = Estage)) +
  geom_density_2d_filled(contour_var = "ndensity") + 
  facet_wrap(TreeCut_re ~ Estage, ncol=3) +
  theme_custom +
  geom_mark_hull(
    aes(fill=TreeCut_re, label=TreeCut_re), 
    alpha=0.5,
    concavity=10, expand = unit(2, "mm"), radius = unit(1, "mm")) +
  #scale_colour_manual(values = pal.YS_clus) +
  NULL

pdf(file.path(outDir_noLMP, "Density_Estage.pdf"), width = 9, height = 7)
p1
p2
dev.off()
## quartz_off_screen 
##                 2

Show proportion of cells in each cluster

meta = umap.df.int %>%
  dplyr::mutate(
    Cond_simple =   gsub("HE_Gfi1", "HE", condition),
    Cond_simple =   gsub("HE_Runx1", "HE", Cond_simple),
    Cond_simple =   gsub("HE_Kneg", "HE_Gfi1_Kneg", Cond_simple))

tmp.pal <- df.col_FACS_simp %>%
  dplyr::filter(FACS %in% meta$Cond_simple) #%>%
  #pull(Pal)

meta <- meta %>%
  dplyr::mutate(Cond_simple = factor(Cond_simple, levels=tmp.pal$FACS))

pal.YS <- c(
  
  rep("#4E79A7", 3), # Endo
  #rep("#F28E2B", 3), # Kit Neg
  rep("#D82526", 3), # Runx1 & GFi1
  #rep("#69B764", 3), # Gfi1
  rep("#7a0177", 2), # EMP
  rep("#f768a1", 2) # LMP
)

# Proportion by FACS
stat1 <- 
  meta %>% 
  dplyr::group_by(TreeCut_re, Cond_simple) %>% 
  dplyr::summarise(count=n()) %>% 
  dplyr::mutate(perc=(count/sum(count)*100)) %>%
  dplyr::mutate(eDay = gsub("YS_", "", Cond_simple),
                eDay = gsub("_.*", "", eDay),
                eDay = factor(eDay, levels = c("E9.0", "E9.5", "E10", "E10.5"))) 
## `summarise()` has grouped output by 'TreeCut_re'. You can override using the `.groups` argument.
ggplot(stat1, aes(x = TreeCut_re, y = perc, fill = Cond_simple, 
                  colour=eDay, linetype=eDay,
                  label = round(perc, 0))) +
  geom_bar(stat="identity", width = 0.5) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #scale_fill_manual(values = tmp.pal$Pal) +
  scale_fill_manual(values = pal.YS) +
  scale_colour_manual(values=c("black", "grey20", "grey50", "black")) +
  scale_linetype_manual(values=c("blank", "solid", "solid", "longdash")) +
  labs(x="Cluster", y="Percentage", fill="FACS") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white"),
     legend.position = "right") + 
  #coord_flip() +
  NULL

# 75w 20h if flip, opposite if not 
#ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "BarChart_cluster.pdf"), device = "pdf", width = 12, height = 5, dpi = 300)
ggsave(filename = file.path(outDir_noLMP, "BarChart_cluster.pdf"), device = "pdf", width = 6, height = 9, dpi = 300)

Show proportion of cells in each FACS

#meta = umap.df.int

tmp.pal <- df.col_FACS_simp %>%
  dplyr::filter(FACS %in% meta$Cond_simple) %>%
  pull(Pal)

# Proportion by FACS
stat1 <- 
  meta %>% 
  #dplyr::filter(TreeCut_re == "k_2") %>%
  dplyr::group_by(Cond_simple, TreeCut_re) %>% 
  dplyr::summarise(count=n()) %>% 
  dplyr::mutate(perc=(count/sum(count)*100)) %>%
  dplyr::mutate(eDay = gsub("YS_", "", Cond_simple),
                eDay = gsub("_.*", "", eDay),
                eDay = factor(eDay, levels = c("E9.0", "E9.5", "E10", "E10.5"))) 
## `summarise()` has grouped output by 'Cond_simple'. You can override using the `.groups` argument.
ggplot(stat1, aes(x = Cond_simple, y = perc, fill = TreeCut_re, 
                  #colour=condition,
                  label = round(perc, 0))) +
  geom_bar(stat="identity", width = 0.5) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  #scale_fill_manual(values = tmp.pal) +
  scale_fill_manual(values = pal.YS_clust_simple) +
  scale_colour_manual(values=c(pal.YS)) +
  #scale_linetype_manual(values=c("blank", "solid", "solid", "longdash")) +
  labs(x="Cluster", y="Percentage", fill="FACS") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white"),
     axis.text.x= element_text(angle = 45),
     legend.position = "right") + 
  #coord_flip() +
  NULL

# 75w 20h if flip, opposite if not 
ggsave(filename = file.path(outDir_noLMP, "BarChart_FACS.pdf"), device = "pdf", width = 12, height = 9, dpi = 300)

Proportion as bar chart, not stacked

ggplot(stat1, aes(x = TreeCut_re, y = perc, fill = TreeCut_re,
                  label = round(perc, 0))) +
  geom_bar(stat="identity", width = 0.5) +
  facet_wrap(~Cond_simple) +
  #geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  geom_text(size = 4) +
  scale_fill_manual(values = pal.YS_clust_simple) +
  labs(x="Cluster", y="Percentage", fill="Cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white"),
     legend.position = "right") + 
  #coord_flip() +
  NULL

ggsave(filename = file.path(outDir_noLMP, "BarChart_facet_FACS.pdf"), device = "pdf", width = 12, height = 8, dpi = 300)

Marker gene

We stick with Tree based clusters

We find the marker gene in each clusters. Seurat is a more suitable approach to find genes associated with clusters

inhouse.combined <- merge(seuset_nova, y = seuset_next, add.cell.ids = c("nova", "next"), project = "inhouse")

Do normal processing

seuset <- inhouse.combined
seuset <- seuset[,seuset$Barcode %in% se$Barcode ]

# Match the order
dd <- seuset[[]]
cD <- as.data.frame(colData(se)) %>%
  dplyr::select(Barcode,TreeCut_re, GraphBased )

dd <- dd %>%
  dplyr::left_join(cD)
## Joining, by = "Barcode"
seuset$GraphBased <- factor(dd$GraphBased, levels=paste0("g_", sprintf("%02d",1:length(unique(dd$GraphBased)))))
seuset$TreeCut_re <- factor(dd$TreeCut_re, levels=paste0("k_", sprintf("%01d",1:length(unique(dd$TreeCut_re)))))

Do normal processing

seuset <- NormalizeData(seuset)
# Feature selections
seuset <- 
  FindVariableFeatures(object = seuset, selection.method = "vst", nfeatures = 2500)
# Scaling the data
all.genes <- rownames(x = seuset)
seuset <- CellCycleScoring(seuset, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE)
## Warning: The following features are not present in the object: MCM5, PCNA,
## TYMS, FEN1, MCM7, MCM4, RRM1, UNG, GINS2, MCM6, CDCA7, DTL, PRIM1, UHRF1, CENPU,
## HELLS, RFC2, POLR1B, NASP, RAD51AP1, GMNN, WDR76, SLBP, CCNE2, UBR7, POLD3,
## MSH2, ATAD2, RAD51, RRM2, CDC45, CDC6, EXO1, TIPIN, DSCC1, BLM, CASP8AP2, USP1,
## CLSPN, POLA1, CHAF1B, MRPL36, E2F8, not searching for symbol synonyms
## Warning: The following features are not present in the object: HMGB2, CDK1,
## NUSAP1, UBE2C, BIRC5, TPX2, TOP2A, NDC80, CKS2, NUF2, CKS1B, MKI67, TMPO, CENPF,
## TACC3, PIMREG, SMC4, CCNB2, CKAP2L, CKAP2, AURKB, BUB1, KIF11, ANP32E, TUBB4B,
## GTSE1, KIF20B, HJURP, CDCA3, JPT1, CDC20, TTK, CDC25C, KIF2C, RANGAP1, NCAPD2,
## DLGAP5, CDCA2, CDCA8, ECT2, KIF23, HMMR, AURKA, PSRC1, ANLN, LBR, CKAP5, CENPE,
## CTCF, NEK2, G2E3, GAS2L3, CBX5, CENPA, not searching for symbol synonyms
## Warning in AddModuleScore(object = object, features = features, name = name, :
## Could not find enough features in the object from the following feature lists:
## S.Score Attempting to match case...Could not find enough features in the object
## from the following feature lists: G2M.Score Attempting to match case...
# Add either originated from nova or next
Seq_name <- colnames(seuset)
Seq_name <- gsub("_.*", "", Seq_name)
seuset$Sequencer <- Seq_name

# Dont need to regress anything at this stage as we only want to remove the tech rep and outlier
seuset <- ScaleData(seuset, 
                    #vars.to.regress = c("S.Score", "G2M.Score", "Sequencer"), 
                    features = rownames(seuset))
## Centering and scaling data matrix
seuset <- RunPCA(seuset, verbose = FALSE)
seuset <- RunUMAP(seuset, dims = 1:30, verbose = FALSE, min.dist = 0.4)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
seuset <- FindNeighbors(seuset, dims = 1:30, verbose = FALSE)
seuset <- FindClusters(seuset, verbose = FALSE)

Show the UMAP plot

DimPlot(seuset)

ggsave(filename = file.path(outDir_noLMP, "Seurat_UMAP_clusterSeu.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)

Another one

Idents(seuset) <- seuset$TreeCut_re
DimPlot(seuset)

ggsave(filename = file.path(outDir_noLMP, "Seurat_UMAP_clusterTreeCut.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
dd2 <- dd %>%
  mutate(seurat_clusters = seuset$seurat_clusters)
row.names(dd2) <- dd2$Barcode
dd2 <- dd2[se$Barcode,]

se$seurat_clusters <- dd2$seurat_clusters

Plot

umap.df.int$seurat_clusters <- dd2$seurat_clusters

p2 <-        
  ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=seurat_clusters)) +
  geom_point() +
  gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  #facet_wrap(~ ClusGroup, ncol=3) +
  theme_custom +
  scale_colour_manual(values=pal.YS_clus) + 
  #scale_colour_manual(values=tableau20) + 
  labs(title="Highlighting the cluster")


# Add text to the plot
clust_data <- umap.df.int %>%
  group_by(seurat_clusters) %>%
  dplyr::summarise(UMAP_1 = mean(UMAP_1),
                   UMAP_2 = mean(UMAP_2))

p2.1 <- p2 +
  geom_point(data = clust_data, aes(fill = seurat_clusters),
                 size = 10, shape = 21, colour = "white") +
      scale_fill_manual(values = pal.YS_clus) +
      geom_text(data = clust_data, aes(label = seurat_clusters),
                colour = "black") +
  theme(legend.position = "none") 

p2.1

ggsave(filename = file.path(outDir_noLMP, "UMAP_SeuratCluster.pdf"), device = "pdf", width = 7, height = 6, dpi = 300)

Find marker genes

su.markers <- 
  FindAllMarkers(seuset, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster k_1
## Calculating cluster k_2
## Calculating cluster k_3
## Calculating cluster k_4
## Calculating cluster k_5
su.markers <- su.markers %>%
  dplyr::mutate(isCS = ifelse(gene %in% df_cs$gene_name, "CS", "notCS"))

write.csv(su.markers, file.path(outDir_noLMP, "SeuratMarkers_TreeCut.csv"))

Plot a dot plot of the top genes

top10 <- su.markers %>% dplyr::group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)

pdf(file.path(outDir_noLMP, "DotPlot.pdf"), height = 12, width = 9)
DotPlot(seuset,features=unique(top10$gene)) + coord_flip()  + 
  theme(axis.text.y = element_text(size = 9)) +
  scale_colour_gradient2(low = "#FF00FF", mid = "#000000", high = "#FFFF00") 
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
dev.off()
## quartz_off_screen 
##                 2
pdf(file.path(outDir_noLMP, "Heatmap.pdf"), height = 12, width = 15)
DoHeatmap(seuset,features=unique(top10$gene), raster = FALSE)  
  #theme(axis.text.y = element_text(size = 9)) +
  #scale_colour_gradient2(low = "#FF00FF", mid = "#000000", high = "#FFFF00") 
dev.off()
## quartz_off_screen 
##                 2

Specific DE

su.markers2 <- 
  FindMarkers(seuset, only.pos = FALSE, 
              ident.1 = c("k_4"), # EMP
              ident.2 = c("k_5"), # LMP
              min.pct = 0.25, logfc.threshold = 0.25) %>%
  tibble::rownames_to_column(var = "gene") %>%
  dplyr::mutate(isCS = ifelse(gene %in% df_cs$gene_name, "CS", "notCS"))

write.csv(su.markers2, file.path(outDir_noLMP, "SeuratMarkers_k4_EMP_vs_k6_LMP.csv"))

Violin plot

Lets view the expression of selected genes

# Plot kit expression
plotUMAP(se, colour_by="Kit") +
  scale_fill_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 0.9)

se$Cond_fac <- umap.df.int$Cond_fac

expInt <- assay(se, "logcounts")["Kit",]
m <- as.data.frame(expInt)


df_plot <- umap.df.int %>%
  mutate(Gene = m$expInt) %>%
  mutate(xLab = paste(Estage_fac, Cond_fac, sep="_"),
         xLab = factor(xLab, levels=
                         c("g1_Endo", "g2_Endo", "g3_Endo",
                           #"g1_Kneg", "g2_Kneg", "g3_Kneg",
                           "g1_HE", "g2_HE", "g3_HE",
                           "g2_EMP", "g3_EMP",
                           "g2_LMP", "g3_LMP"
                           )))
pal.YS <- c(
  
  rep("#4E79A7", 3), # Endo
  #rep("#F28E2B", 3), # Kit Neg
  rep("#D82526", 3), # Runx1 & GFi1
  #rep("#69B764", 3), # Gfi1
  rep("#7a0177", 2), # EMP
  rep("#f768a1", 2) # LMP
)

p_viol_gene <- 
  df_plot %>%
  ggplot(aes(x = xLab, y = Gene, fill = Cond_fac)) +
  geom_violin(alpha = 0.5, scale = "width", position = position_dodge(width = 0.9)) +
  geom_boxplot(alpha = 0.5, width = 0.2, position = position_dodge(width = 0.9)) +
  scale_fill_manual(values=unique(pal.YS)) +
  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major = element_line(linetype = "blank"), 
        panel.grid.minor = element_line(linetype = "blank"),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,12),
                     breaks = c(0,5,10)) +
  xlab("") +
  NULL

pdf(file.path(outDir_noLMP, "Kit_exp.pdf"),height = 3, width = 6)
p_viol_gene
dev.off()
## quartz_off_screen 
##                 2

Nicer UMAP plot

geneDir <- file.path(outDir_noLMP, "gene_exp")
dir.create(geneDir)
## Warning in dir.create(geneDir): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet/gene_exp' already
## exists
gg <- "Kit"
expInt <- assay(se, "logcounts")[gg,]
#exp2 <- sacle(expInt)
#exp2 <- as.vector(exp2)
df_plot <- umap.df.int %>%
  mutate(gene = expInt)

ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "none") +
  scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL

ggsave(filename = file.path(geneDir, paste0(gg, "_UMAP.png")), device = "png", width = 6, height = 4, dpi = 300)

Multiple genes

We show violin plot of mulitple genes

e make a violin plot of genes expressed in each cluster

geneToPlot <- c(

  "Runx1",
  "Gfi1",
  
  "Gfi1b",
  "Itga2b",
  "Itgb3",
  "Ptprc",
  
  "Epor",
  "Mpl",
  "Rgs18",
  "Pf4",
  
  
  "Mpo",
  "Il7r")


geneToPlot <- c(

  "Lyve1",

  "Runx1",
  "Gfi1",
  
  "Gfi1b",
  #"Rgs18",
  "Itga2b",
  "Itgb3",
  "Ptprc",
  
  "Il7r",
  "Ighm",
  
  "Epor",
  "Mpl",
  "Pf4")

Lets try to make a nice violin plot of these genes

setmp <- se
setmp$finalCluster2 <- setmp$TreeCut_re
pal <- pal.YS_clust_simple
expInt <- assay(setmp, "logcounts")[geneToPlot,]
m <- as.data.frame(expInt)
colnames(m) <- setmp$Barcode
m$gene <- row.names(m)
m <- melt(m)
## Using gene as id variables
m$gene <- factor(m$gene, levels=geneToPlot)
dftmp <- as.data.frame(colData(setmp)) 
cD_sub <- dftmp %>%
  dplyr::select(Barcode, finalCluster2) # %>%
#dplyr::filter(cell_cycle %in% "G1")
m <- m %>%
  dplyr::left_join(cD_sub, by = c("variable" = "Barcode"))
# Plot
p_viol_gene <- 
  m %>%
  ggplot(aes(x = finalCluster2, y = value, colour = finalCluster2)) +
  geom_violin(fill=NA, scale="width") +
    geom_quasirandom(groupOnX=TRUE, size = 0.8, alpha = 0.5, width = 0.4) +
    stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean,
                 geom = "crossbar", width = 0.3, alpha = 0.8, colour="black") +

  facet_grid(gene ~ .) +
  scale_colour_manual(values=pal) +
  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major = element_line(linetype = "blank"), 
        panel.grid.minor = element_line(linetype = "blank"),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,15),
                     breaks = c(0,5,10, 15)) +
  xlab("") +
  NULL
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.
# 45w 77h
ggsave(filename = file.path(outDir_noLMP, "SelectedGenes.pdf"), device = "pdf", width = 6, height = 14, dpi = 300)

Not RUN

If we want to show numb of expressing cells

  # Find number of cells expressing in each group and write it on the top of the plot
df_g <- m %>%
    group_by(finalCluster2, gene) %>%
    dplyr::summarise(Ncell = n(),
              Nexp = sum( value >0 )) %>%
    dplyr::mutate(PecExp = round(Nexp / Ncell * 100, 1),
                  #Text = paste0(Nexp, " (", PecExp, "%", ")")) 
                  Text = paste0(" (", PecExp, "%", ")")) 
                  #Text = paste0(Nexp, "/", Ncell, "=", PecExp, "%")) 

p_viol_gene2 <-
   m %>%
  ggplot(aes(x = finalCluster2, y = value, colour = finalCluster2)) +
  geom_violin(fill=NA, scale="width") +
    geom_quasirandom(groupOnX=TRUE, size = 2.2, alpha = 0.7, width = 0.4) +
    stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean,
                 geom = "crossbar", width = 0.3, alpha = 0.8, colour="black") +

  facet_grid(gene ~ .) +
  scale_colour_manual(values=pal) +
  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major = element_line(linetype = "blank"), 
        panel.grid.minor = element_line(linetype = "blank"),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,15),
                     breaks = c(0,5,10, 15)) +
  xlab("") +
  xlab("") +
  geom_text(data=df_g ,
              aes(x = finalCluster2, y = max(m$value + 1), label=Text),
              color="black", size = 4) +
  NULL

pdf(file.path(outDir_DE, "MarkerGenes_perCluster_v4.pdf"), height = 21.6, width = 19.6)
p_viol_gene2
dev.off()

Index correlation

We want to see if we can say antyhing about the index sort values. Subset to only the interested YS populations

cD <- as.data.frame(colData(se))

# Join with index FACS info
dd <- cD %>%
  mutate(UniqID = paste(SC_project, SC_plate, FACS_well, sep="_")) %>%
  left_join(df_indx)


# Add a column to simplify the annotation
dd <- dd %>%
  mutate(
    MainPop = gsub("AGM_", "", condition),
    MainPop = gsub("YS_", "", condition),
    
    MainPop = gsub("HE_Gfi1", "HE", MainPop),
    MainPop = gsub("HE_Runx1", "HE", MainPop)
  )


umap.df.int <- umap.df.int %>%
  dplyr::mutate(Barcode = se$Barcode)

dd_sub <- dd %>%
  dplyr::filter(condition %in% c("YS_E9.0_HE_Runx1", "YS_E9.5_HE_Runx1", "YS_E10.5_HE_Runx1",
                                 "YS_E9.0_HE_Gfi1", "YS_E9.5_HE_Gfi1", "YS_E10.5_HE_Gfi1")) %>%
  #dplyr::filter(Allele == "RG") %>%
  mutate(lRUNX1b = log10(Runx1b_RFP + 200) ,
         lGfi1 = log10(Gfi1_GFP+ 200) ) %>%
  mutate(MainPop = factor(MainPop, levels=c("E9.0_HE", "E9.5_HE", "E10.5_HE"))) %>%
  dplyr::left_join(dplyr::select(umap.df.int, UMAP_1, UMAP_2, Barcode))

Plot RFP in UMAP

gg <- "RFP"
expInt <- data.frame(
  gene = assay(se, "logcounts")[gg,],
  Barcode = se$Barcode)

df_plot <- dd_sub %>%
  left_join(expInt)

ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
  geom_point() +
  theme_custom +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "none") +
  scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
  NULL
p2 <-
ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=Allele)) +
  geom_point(alpha = 0.5, size=3) +
  #gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~MainPop) +
  theme_custom +
  geom_vline(xintercept = 3.5, linetype = "dashed") +
  geom_hline(yintercept = 3, linetype = "dashed") +
  #scale_colour_manual(values=c("#00B050", "#FF006D")) +
  scale_colour_manual(values=c("black", "firebrick")) +
  NULL


pdf(file.path(outDir_inHouse6_scater_outlier, "INDEX_corelatiion.pdf"), height=4, width = 9)
p2
dev.off()

Extract the cells with really high Gfi1 or Runx1

dd_sub <- dd_sub %>%
  dplyr::mutate(FACSgate  = case_when(
    lRUNX1b >= 3.5 & lGfi1 >= 4  & condition == "YS_E10.5_HE_Gfi1" ~ "High",
    lRUNX1b <= 3.2 & lGfi1 <= 3.2 ~ "Low",
    TRUE ~ "notAssigned")) 


ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=FACSgate)) +
  geom_point(alpha = 0.5, size=3) +
  #gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~MainPop) +
  theme_custom +
  geom_vline(xintercept = 4, linetype = "dashed") +
  geom_hline(yintercept = 3.5, linetype = "dashed") +
  geom_vline(xintercept = 3.2, linetype = "dashed", colour="red") +
  geom_hline(yintercept = 3.2, linetype = "dashed", colour="red") +
  NULL

ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=condition)) +
  geom_point(alpha = 0.5, size=3) +
  #gghighlight(use_direct_label = FALSE) +
  theme(legend.position = "right") +
  facet_wrap(~MainPop) +
  theme_custom +
  geom_vline(xintercept = 4, linetype = "dashed") +
  geom_hline(yintercept = 3.5, linetype = "dashed") +
  geom_vline(xintercept = 3.2, linetype = "dashed", colour="red") +
  geom_hline(yintercept = 3.2, linetype = "dashed", colour="red") +
  NULL

Plot where they are in UMAP

dd_sub2 <- dd_sub %>%
  dplyr::select(Barcode, FACSgate)



dd2 <- dd %>%
  left_join(dd_sub2) 


dd_dim <- reducedDim(se, "UMAP")
colnames(dd_dim) <- c("UMAP_1", "UMAP_2")


dd2 <- cbind(dd2, dd_dim)

ggplot(dd2, aes(x=UMAP_1, y=UMAP_2, colour=FACSgate)) +
    geom_point(size=3) +
    theme_custom +
  NULL

Save

Save the seObject with the cluster information

# Cells to exclude
toRemove <- c(toRemove, toRemove_nonLMP)

saveRDS(se, file.path(outData_dir, "se_YS_only_fullInfo.RDS"))
write.csv(umap.df.int, file.path(outDir_noLMP, "Metadata_YS_only.csv"))
saveRDS(toRemove, file.path(outData_dir, "Cells_to_exclude.csv"))

# Full image
save.image(file=file.path(outDir_noLMP, "Full_image.Rdata"))
#load(file=file.path(outDir_noLMP, "Full_image.Rdata"))

Session Info

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dynamicTreeCut_1.63-1       dendextend_1.15.1          
##  [3] scran_1.20.1                paletteer_1.3.0            
##  [5] ggforce_0.3.3               viridis_0.6.1              
##  [7] viridisLite_0.4.0           ggrepel_0.9.1              
##  [9] gghighlight_0.3.2           ggbeeswarm_0.6.0           
## [11] reshape2_1.4.4              dplyr_1.0.6                
## [13] patchwork_1.1.1             SeuratObject_4.0.4         
## [15] Seurat_4.0.6                scater_1.20.1              
## [17] ggplot2_3.3.4               scuttle_1.2.0              
## [19] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [21] Biobase_2.52.0              GenomicRanges_1.44.0       
## [23] GenomeInfoDb_1.28.0         IRanges_2.26.0             
## [25] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [27] MatrixGenerics_1.4.0        matrixStats_0.59.0         
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1                reticulate_1.20          
##   [3] tidyselect_1.1.1          htmlwidgets_1.5.3        
##   [5] grid_4.1.0                BiocParallel_1.26.0      
##   [7] Rtsne_0.15                munsell_0.5.0            
##   [9] ScaledMatrix_1.0.0        codetools_0.2-18         
##  [11] ica_1.0-2                 statmod_1.4.36           
##  [13] future_1.21.0             miniUI_0.1.1.1           
##  [15] withr_2.4.2               colorspace_2.0-1         
##  [17] highr_0.9                 knitr_1.33               
##  [19] ROCR_1.0-11               tensor_1.5               
##  [21] listenv_0.8.0             labeling_0.4.2           
##  [23] GenomeInfoDbData_1.2.6    polyclip_1.10-0          
##  [25] farver_2.1.0              parallelly_1.26.0        
##  [27] vctrs_0.3.8               generics_0.1.0           
##  [29] xfun_0.24                 R6_2.5.1                 
##  [31] rsvd_1.0.5                locfit_1.5-9.4           
##  [33] isoband_0.2.4             concaveman_1.1.0         
##  [35] bitops_1.0-7              spatstat.utils_2.2-0     
##  [37] DelayedArray_0.18.0       assertthat_0.2.1         
##  [39] promises_1.2.0.1          scales_1.1.1             
##  [41] beeswarm_0.4.0            gtable_0.3.0             
##  [43] beachmat_2.8.0            globals_0.14.0           
##  [45] goftest_1.2-2             rlang_0.4.11             
##  [47] splines_4.1.0             lazyeval_0.2.2           
##  [49] spatstat.geom_2.2-0       prismatic_1.0.0          
##  [51] yaml_2.2.1                abind_1.4-5              
##  [53] httpuv_1.6.1              tools_4.1.0              
##  [55] ellipsis_0.3.2            spatstat.core_2.2-0      
##  [57] jquerylib_0.1.4           RColorBrewer_1.1-2       
##  [59] ggridges_0.5.3            Rcpp_1.0.7               
##  [61] plyr_1.8.6                sparseMatrixStats_1.4.0  
##  [63] zlibbioc_1.38.0           purrr_0.3.4              
##  [65] RCurl_1.98-1.3            rpart_4.1-15             
##  [67] deldir_0.2-10             pbapply_1.4-3            
##  [69] cowplot_1.1.1             zoo_1.8-9                
##  [71] cluster_2.1.2             magrittr_2.0.1           
##  [73] data.table_1.14.0         RSpectra_0.16-0          
##  [75] scattermore_0.7           lmtest_0.9-38            
##  [77] RANN_2.6.1                fitdistrplus_1.1-5       
##  [79] mime_0.10                 evaluate_0.14            
##  [81] xtable_1.8-4              gridExtra_2.3            
##  [83] compiler_4.1.0            tibble_3.1.2             
##  [85] KernSmooth_2.23-20        V8_3.4.2                 
##  [87] crayon_1.4.1              htmltools_0.5.1.1        
##  [89] mgcv_1.8-35               later_1.2.0              
##  [91] tidyr_1.1.3               DBI_1.1.1                
##  [93] tweenr_1.0.2              MASS_7.3-54              
##  [95] Matrix_1.3-3              metapod_1.0.0            
##  [97] igraph_1.2.6              pkgconfig_2.0.3          
##  [99] plotly_4.9.4.1            spatstat.sparse_2.0-0    
## [101] vipor_0.4.5               bslib_0.2.5.1            
## [103] dqrng_0.3.0               XVector_0.32.0           
## [105] stringr_1.4.0             digest_0.6.27            
## [107] sctransform_0.3.2         RcppAnnoy_0.0.18         
## [109] spatstat.data_2.1-0       rmarkdown_2.9            
## [111] leiden_0.3.8              uwot_0.1.10              
## [113] edgeR_3.34.0              DelayedMatrixStats_1.14.0
## [115] curl_4.3.1                shiny_1.6.0              
## [117] lifecycle_1.0.0           nlme_3.1-152             
## [119] jsonlite_1.7.2            BiocNeighbors_1.10.0     
## [121] limma_3.48.0              fansi_0.5.0              
## [123] pillar_1.6.1              lattice_0.20-44          
## [125] fastmap_1.1.0             httr_1.4.2               
## [127] survival_3.2-11           glue_1.4.2               
## [129] FNN_1.1.3                 png_0.1-7                
## [131] bluster_1.2.1             stringi_1.6.2            
## [133] sass_0.4.0                rematch2_2.1.2           
## [135] BiocSingular_1.8.1        irlba_2.3.3              
## [137] future.apply_1.7.0